In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px

from pathlib import Path

# Global plot styling shared by every figure in this notebook.
sns.set_style('whitegrid')
plt.rcParams.update({'font.size': 12, 'figure.figsize': (15, 5)})

# Prefer a copy of the dataset sitting next to the notebook so the analysis can be
# re-run on another machine; otherwise fall back to the original local location.
DATA_PATH = Path('world-happiness-report-2021.csv')
if not DATA_PATH.exists():
    DATA_PATH = Path(r'C:\Users\TajwarAbtahee\OneDrive - JCW Resourcing\Desktop\Python\practice\world-happiness-report-2021.csv')

df = pd.read_csv(DATA_PATH)
df.head()
Out[1]:
Country name Regional indicator Ladder score Standard error of ladder score upperwhisker lowerwhisker Logged GDP per capita Social support Healthy life expectancy Freedom to make life choices Generosity Perceptions of corruption Ladder score in Dystopia Explained by: Log GDP per capita Explained by: Social support Explained by: Healthy life expectancy Explained by: Freedom to make life choices Explained by: Generosity Explained by: Perceptions of corruption Dystopia + residual
0 Finland Western Europe 7.842 0.032 7.904 7.780 10.775 0.954 72.0 0.949 -0.098 0.186 2.43 1.446 1.106 0.741 0.691 0.124 0.481 3.253
1 Denmark Western Europe 7.620 0.035 7.687 7.552 10.933 0.954 72.7 0.946 0.030 0.179 2.43 1.502 1.108 0.763 0.686 0.208 0.485 2.868
2 Switzerland Western Europe 7.571 0.036 7.643 7.500 11.117 0.942 74.4 0.919 0.025 0.292 2.43 1.566 1.079 0.816 0.653 0.204 0.413 2.839
3 Iceland Western Europe 7.554 0.059 7.670 7.438 10.878 0.983 73.0 0.955 0.160 0.673 2.43 1.482 1.172 0.772 0.698 0.293 0.170 2.967
4 Netherlands Western Europe 7.464 0.027 7.518 7.410 10.932 0.942 72.4 0.913 0.175 0.338 2.43 1.501 1.079 0.753 0.647 0.302 0.384 2.798
In [2]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()
Out[2]:
Country name                                  0
Regional indicator                            0
Ladder score                                  0
Standard error of ladder score                0
upperwhisker                                  0
lowerwhisker                                  0
Logged GDP per capita                         0
Social support                                0
Healthy life expectancy                       0
Freedom to make life choices                  0
Generosity                                    0
Perceptions of corruption                     0
Ladder score in Dystopia                      0
Explained by: Log GDP per capita              0
Explained by: Social support                  0
Explained by: Healthy life expectancy         0
Explained by: Freedom to make life choices    0
Explained by: Generosity                      0
Explained by: Perceptions of corruption       0
Dystopia + residual                           0
dtype: int64
In [3]:
# Columns kept for the analysis: identifiers, the happiness score and its six indicators.
data_cols = [
    'Country name',
    'Regional indicator',
    'Ladder score',
    'Logged GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
]
In [4]:
# Narrow df to the analysis columns; .copy() yields an independent frame rather than a view.
df = df.loc[:, data_cols].copy()
In [5]:
# Top 20 countries by happiness (ladder score).
top = df.sort_values(by='Ladder score', ascending=False).head(n=20)
sns.barplot(x='Country name', y='Ladder score', data=top)
plt.xticks(rotation=30)
# The y-axis is zoomed in (it does not start at 0) so the small gaps between scores are visible.
plt.ylim(6.75, 8)
plt.show()

# How many of the top 20 come from each region.
print(top['Regional indicator'].value_counts())
# 5 Nordic countries (Finland, Denmark, Iceland, Norway, Sweden) are in the top 7.
# The majority of the top 20 are from Western Europe.
Western Europe                  13
North America and ANZ            4
Middle East and North Africa     1
Central and Eastern Europe       1
Latin America and Caribbean      1
Name: Regional indicator, dtype: int64
In [6]:
# GDP vs. happiness, with an ordinary-least-squares trend line.
px.scatter(
    data_frame=df,
    x='Logged GDP per capita',
    y='Ladder score',
    trendline='ols',
)
# As we can see, the two variables are strongly positively correlated.
In [7]:
# Let's explore the other correlations in the data.
# Restrict to numeric columns explicitly: df still holds the text columns
# 'Country name' and 'Regional indicator', and DataFrame.corr() on pandas >= 2.0
# raises on non-numeric data instead of silently dropping it.
df.select_dtypes(include='number').corr().style.background_gradient(cmap='RdYlGn')
# Ladder score (happiness) has its strongest relationship with GDP per capita,
# followed by healthy life expectancy and social support.
# Generosity is only weakly correlated with every other variable.
Out[7]:
Ladder score Logged GDP per capita Social support Healthy life expectancy Freedom to make life choices Generosity Perceptions of corruption
Ladder score 1.000000 0.789760 0.756888 0.768099 0.607753 -0.017799 -0.421140
Logged GDP per capita 0.789760 1.000000 0.785299 0.859461 0.432323 -0.199286 -0.342337
Social support 0.756888 0.785299 1.000000 0.723256 0.482930 -0.114946 -0.203207
Healthy life expectancy 0.768099 0.859461 0.723256 1.000000 0.461494 -0.161750 -0.364374
Freedom to make life choices 0.607753 0.432323 0.482930 0.461494 1.000000 0.169437 -0.401363
Generosity -0.017799 -0.199286 -0.114946 -0.161750 0.169437 1.000000 -0.163962
Perceptions of corruption -0.421140 -0.342337 -0.203207 -0.364374 -0.401363 -0.163962 1.000000
In [16]:
# GDP vs. happiness again, this time with one colour per region.
sns.scatterplot(
    x='Logged GDP per capita',
    y='Ladder score',
    hue='Regional indicator',
    s=200,
    data=df,
)
# Anchor the legend's upper-left corner just outside the right edge of the axes.
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.0)
plt.show()
In [9]:
# Interactive version of the regional scatter; hovering a point shows the country name.
px.scatter(
    data_frame=df,
    x='Logged GDP per capita',
    y='Ladder score',
    color='Regional indicator',
    hover_name='Country name',
)
# African countries sit at the lower end, whereas Western European countries are at the upper end.
# Latin American and Caribbean countries have above-average happiness.
# Afghanistan lies very far from its regional cluster.
In [23]:
# Per-region total of 'Logged GDP per capita'.
# NOTE(review): this adds up logged per-capita values, so a region's total grows with
# the number of countries it contains; it is not the region's total GDP.
gdp = (
    df.groupby('Regional indicator')['Logged GDP per capita']
    .sum()
    .reset_index()
)
gdp
Out[23]:
Regional indicator Logged GDP per capita
0 Central and Eastern Europe 171.854
1 Commonwealth of Independent States 112.822
2 East Asia 62.206
3 Latin America and Caribbean 187.400
4 Middle East and North Africa 164.324
5 North America and ANZ 43.238
6 South Asia 60.778
7 Southeast Asia 84.793
8 Sub-Saharan Africa 290.707
9 Western Europe 227.277
In [25]:
# Share of the summed 'Logged GDP per capita' held by each region.
px.pie(
    data_frame=gdp,
    values='Logged GDP per capita',
    names='Regional indicator',
)
# Sub-Saharan Africa and Western Europe account for the largest slices.
In [30]:
# Number of countries in each region, most frequent first.
df['Regional indicator'].value_counts(ascending=False)
# This ties directly to the previous chart: Sub-Saharan African countries are the
# most prevalent in this dataset, so their summed value is the largest.
Out[30]:
Sub-Saharan Africa                    36
Western Europe                        21
Latin America and Caribbean           20
Middle East and North Africa          17
Central and Eastern Europe            17
Commonwealth of Independent States    12
Southeast Asia                         9
South Asia                             7
East Asia                              6
North America and ANZ                  4
Name: Regional indicator, dtype: int64
In [47]:
# Average perception of corruption per region (2 decimals), highest first.
corrupt = (
    df.groupby('Regional indicator')['Perceptions of corruption']
    .mean()
    .round(2)
    .reset_index()
    .sort_values(by='Perceptions of corruption', ascending=False)
)
corrupt
Out[47]:
Regional indicator Perceptions of corruption
0 Central and Eastern Europe 0.85
6 South Asia 0.80
3 Latin America and Caribbean 0.79
8 Sub-Saharan Africa 0.77
4 Middle East and North Africa 0.76
1 Commonwealth of Independent States 0.73
7 Southeast Asia 0.71
2 East Asia 0.68
9 Western Europe 0.52
5 North America and ANZ 0.45
In [50]:
# One bar per region, each annotated with its value from `corrupt`.
ax = sns.barplot(
    x='Regional indicator',
    y='Perceptions of corruption',
    data=corrupt,
)
ax.bar_label(ax.containers[0], labels=corrupt['Perceptions of corruption'])
plt.xticks(rotation='vertical')
plt.show()
In [88]:
# First ten rows of df (the file appears to be ordered by ladder score, highest
# first), sorted by healthy life expectancy.
# Built here instead of relying on a later cell having been executed first, so the
# notebook also works on a fresh kernel run from top to bottom.
top10=df.head(10).sort_values('Healthy life expectancy',ascending=False)
top10
Out[88]:
Country name Regional indicator Ladder score Logged GDP per capita Social support Healthy life expectancy Freedom to make life choices Generosity Perceptions of corruption
2 Switzerland Western Europe 7.571 11.117 0.942 74.4 0.919 0.025 0.292
8 New Zealand North America and ANZ 7.277 10.643 0.948 73.4 0.929 0.134 0.242
5 Norway Western Europe 7.392 11.053 0.954 73.3 0.960 0.093 0.270
9 Austria Western Europe 7.268 10.906 0.934 73.3 0.908 0.042 0.481
3 Iceland Western Europe 7.554 10.878 0.983 73.0 0.955 0.160 0.673
1 Denmark Western Europe 7.620 10.933 0.954 72.7 0.946 0.030 0.179
6 Sweden Western Europe 7.363 10.867 0.934 72.7 0.945 0.086 0.237
7 Luxembourg Western Europe 7.324 11.647 0.908 72.6 0.907 -0.034 0.386
4 Netherlands Western Europe 7.464 10.932 0.942 72.4 0.913 0.175 0.338
0 Finland Western Europe 7.842 10.775 0.954 72.0 0.949 -0.098 0.186
In [94]:
# Healthy life expectancy of the 10 happiest and the 10 least happy countries.
# Rank by ladder score explicitly (as the top-20 cell does) rather than trusting
# the row order of the CSV, then order each group by life expectancy for plotting.
top10=df.sort_values('Ladder score',ascending=False).head(10)
top10=top10.sort_values('Healthy life expectancy',ascending=False)
bot10=df.sort_values('Ladder score',ascending=False).tail(10)
bot10=bot10.sort_values('Healthy life expectancy',ascending=False)

fig, axes=plt.subplots(1,2)

# Left panel: happiest countries.
t=sns.barplot(data=top10,x='Country name',y='Healthy life expectancy',ax=axes[0])
t.bar_label(container=t.containers[0],labels=top10['Healthy life expectancy'])
# Rotate the labels seaborn already placed; set_xticklabels() would re-supply the
# text and can warn when the tick positions are not fixed.
axes[0].tick_params(axis='x',labelrotation=30)
axes[0].set_ylim(71,75)  # zoomed y-range, bars do not start at 0
axes[0].set_title('Top 10 Happy Countries')

# Right panel: least happy countries.
b=sns.barplot(data=bot10,x='Country name',y='Healthy life expectancy',ax=axes[1])
b.bar_label(container=b.containers[0],labels=bot10['Healthy life expectancy'])
axes[1].tick_params(axis='x',labelrotation=30)
axes[1].set_ylim(45,65)  # zoomed y-range, bars do not start at 0
axes[1].set_title('Bottom 10 Happy Countries')

plt.show()

# The top 10 countries all have a healthy life expectancy above 70, within a narrow range (more consistent).
# Most of the bottom 10 countries have expectancies between 50 and 60.
In [95]:
# Pairwise scatter plots of every numeric indicator, coloured by region.
sns.pairplot(data=df, hue='Regional indicator')
# In the majority of relationships Western Europe tends to hold the best position,
# while Sub-Saharan African countries tend to sit at the opposite end.
Out[95]:
<seaborn.axisgrid.PairGrid at 0x1ea28bdf640>
In [96]:
# Quick reminder of what the trimmed frame looks like (first five rows).
df.head(n=5)
Out[96]:
Country name Regional indicator Ladder score Logged GDP per capita Social support Healthy life expectancy Freedom to make life choices Generosity Perceptions of corruption
0 Finland Western Europe 7.842 10.775 0.954 72.0 0.949 -0.098 0.186
1 Denmark Western Europe 7.620 10.933 0.954 72.7 0.946 0.030 0.179
2 Switzerland Western Europe 7.571 11.117 0.942 74.4 0.919 0.025 0.292
3 Iceland Western Europe 7.554 10.878 0.983 73.0 0.955 0.160 0.673
4 Netherlands Western Europe 7.464 10.932 0.942 72.4 0.913 0.175 0.338
In [128]:
# Compare countries with respect to perception of corruption:
# the 10 lowest scores (left panel) against the 10 highest (right panel).
corrtop=df.sort_values('Perceptions of corruption').head(10)
corrbot=df.sort_values('Perceptions of corruption',ascending=False).head(10)

fig,axes=plt.subplots(1,2)

a=sns.barplot(data=corrtop,x='Country name',y='Perceptions of corruption',ax=axes[0])
a.bar_label(container=a.containers[0],labels=corrtop['Perceptions of corruption'])
# Rotate the labels seaborn already placed; set_xticklabels() would re-supply the
# text and can warn when the tick positions are not fixed.
axes[0].tick_params(axis='x',labelrotation=90)
axes[0].set_title('Top 10 Least Corrupt')

b=sns.barplot(data=corrbot,x='Country name',y='Perceptions of corruption',ax=axes[1])
b.bar_label(container=b.containers[0],labels=corrbot['Perceptions of corruption'])
axes[1].tick_params(axis='x',labelrotation=90)
axes[1].set_title('Top 10 Most Corrupt')
# Zoom the right-hand panel explicitly; plt.ylim() acts on whichever axes happens
# to be current, which hides which panel is being changed.
axes[1].set_ylim(0.9,0.95)

plt.show()

# Regional make-up of each group.
print(corrtop['Regional indicator'].value_counts())
print(corrbot['Regional indicator'].value_counts())

# Singapore is the least corrupt by a large margin; however, most of the least-corrupt countries are from Western Europe.
# Central and Eastern Europe has the highest number of countries among the most corrupt, and all ten are above 0.9 with a small range.
# Referring back to our pairplot: the higher the perceived corruption, the lower a country's happiness.
Western Europe           7
Sub-Saharan Africa       1
Southeast Asia           1
North America and ANZ    1
Name: Regional indicator, dtype: int64
Central and Eastern Europe            6
Commonwealth of Independent States    2
South Asia                            1
Sub-Saharan Africa                    1
Name: Regional indicator, dtype: int64